# GSS_makeDatasetForImputation.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.



##############################################################################
# PRELIMINARIES
##############################################################################
library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))  # for merge_fac
library(car)      # for Recode()
library(dplyr)    # for %>%
library(forcats)  # for fct_relevel()
library(haven)    # for read_dta(), as_factor(), na_tag()
library(stringr)  # for str_pad()

# LOAD CUMULATIVE GSS
# Read the cumulative GSS Stata file from the data/ directory and keep only
# the variables that are used below. If the file is not there yet, download 
# the zip archive to a temporary file and extract the .DTA file into data/.
GSS_cumulative_filename <- 'GSS7218_R1.DTA'
GSS_cumulative_path     <- file.path('data', GSS_cumulative_filename)
if (! file.exists(GSS_cumulative_path)) {
  GSS_cumulative <- tempfile(fileext = '.zip')
  download.file(
    url      = 'http://www.gss.norc.org/documents/stata/GSS_stata.zip', 
    destfile = GSS_cumulative,
    mode     = 'wb')  # the archive is binary
  unzip(GSS_cumulative, GSS_cumulative_filename, exdir = 'data')
  unlink(GSS_cumulative)  # the temporary archive is no longer needed
  
  # Fail here, with a clear message, rather than inside read_dta().
  if (! file.exists(GSS_cumulative_path)) {
    stop(
      'Could not extract ', GSS_cumulative_filename, 
      ' from the downloaded archive.', 
      call. = FALSE)
  }
}
GSS <- read_dta(GSS_cumulative_path) %>%
  select(
    year,
    reg16,
    educ,
    degree,
    paeduc,
    maeduc,
    padeg,
    madeg,
    cohort,
    id,
    age,
    sex,
    race,
    born,
    eqwlth,
    goveqinc,
    eqincome,
    helppoor,
    natfare,
    marital,
    realrinc,
    incdef,
    prestige,
    papres16,
    papres80,
    goodlife,
    starts_with('pay'),
    starts_with('giv'),
    getahead,
    welfare1,
    wordsum,    
    occ,
    occ80,
    
    wrkstat,
    wrkslf,
    income,
    rincome,
    prestg80,
    hompop,
    
    # Gender attitudes
    fehome,
    fework,
    fepres,
    fepol,
    fechld,
    fepresch,
    fefam,
    
    # Racial attitudes
    racmar,
    raclive,
    rachome,
    busing,
    racpres,
    racdif1,
    racdif2,
    racdif3,
    racdif4,
    helpblk,
      
    # Spending attitudes
    natspac,
    natenvir,
    natheal,
    natcity,
    natcrime,
    natdrug,
    nateduc,
    natrace,
    nataid,
    natroad,
    natsoc,
    natmass,
    natpark,
    
    # Tolerance  
    spkath,
    colath,
    libath,
    spkrac,
    colrac,
    librac,
    spkcom,
    colcom,
    libcom,
    spkmil,
    colmil,
    libmil,
    spkhomo,
    colhomo,
    libhomo,
      
    # Other variables
    partyid,
    polviews,
    relig,
    reliten) 



##############################################################################
# MISSING-DATA EXPLORATION AND VARIABLE CODING
##############################################################################
# Code DK and NA as NA. But for the response variables, code "IAP" -- used for 
# people who were never assigned to be asked a question -- as 0. These cases 
# will be omitted from the analysis when GSS_createImputed.R is called.


# In the assignments below, na_tag() returns the tag of a tagged NA, and the 
# tag 'i' marks "IAP" cases. `%in%` is used instead of `==` so that the mask 
# is FALSE, rather than NA, for values that carry no tag.

# EQWLTH
GSS$eqwlth[na_tag(GSS$eqwlth) %in% 'i'] <- 0

# WELFARE
GSS$welfare <- GSS$natfare
GSS$welfare[na_tag(GSS$natfare) %in% 'i'] <- 0

# HELPPOOR
GSS$helppoor[na_tag(GSS$helppoor) %in% 'i'] <- 0

# GOVEQINC
# goveqinc and eqincome are merged into a single integer-coded variable. A 
# case is coded 0 only if it is IAP on both of the original variables.
# NOTE(review): the Recode target "NEITHER" is upper case, but the level 
# passed to fct_relevel() is 'neither'. Confirm against the labels in the 
# data file that these end up as the same level.
tmp.goveqinc <- as_factor(GSS$goveqinc, ordered = TRUE) %>%
  { Recode(., 'c("IAP", "NA")=NA; "CANT CHOOSE"="NEITHER"') } %>%
  fct_relevel('strongly agree', 'agree', 'neither', 'disagree', 'strongly disagree')

# Convert eqincome to its value labels before matching it against the level 
# names. Matching the labelled numeric codes directly against these strings 
# finds no matches and yields a vector that is entirely NA.
tmp.eqincome <- ordered(
  as_factor(GSS$eqincome), 
  levels = c('strongly agree', 'agree', 'neither', 'disagree', 'strongly disagree'))

# Compute the IAP mask before GSS$goveqinc is overwritten below.
shouldBeZero <- na_tag(GSS$goveqinc) %in% 'i' & na_tag(GSS$eqincome) %in% 'i' 

# merge_fac() is given the *names* of the two factors, so the objects 
# tmp.goveqinc and tmp.eqincome must exist under exactly these names.
tmp.merged <- merge_fac(c('tmp.goveqinc', 'tmp.eqincome')) %>% as.integer()  
tmp.merged[shouldBeZero] <- 0
GSS$goveqinc <- tmp.merged 
  
# OTHER VARIABLES
# Survey year and respondent sex.
yearInt  <- as.integer(GSS$year)
female   <- as_factor(GSS$sex) == 'female'

# Age and years of education, with codes 98 and 99 set to NA.
age      <- as.integer(Recode(GSS$age,  '98:99=NA'))
educ     <- as.integer(Recode(GSS$educ, '98:99=NA'))

# Race as a factor: unused levels are dropped, "other" is renamed, and 
# "white" is made the first (reference) level.
race     <- relevel(
  Recode(droplevels(as_factor(GSS$race)), '"other"="otherRace"'), 
  'white')

# TRUE if born in the US, FALSE if not, and NA for IAP, DK, and NA responses.
bornInUS <- local({
  born <- as_factor(GSS$born)
  Recode(born, 'c("IAP", "DK", "NA")=NA') == 'yes'
})



##############################################################################
# CREATE DATA FRAMES FOR IMPUTATION 
##############################################################################
# This code block creates two data frames. The "reqd.vars" data frame contains
# all variables that must be in the imputed dataset. And the "cand.vars" 
# data frame contains variables that may help with the imputation.  


# REQUIRED VARIABLES
# respondentID is the survey year followed by the within-year id, left-padded 
# with zeros to four characters, and converted to an integer. 
reqd.vars <- data.frame(
  respondentID = as.integer(
    paste0(GSS$year, str_pad(GSS$id, width = 4, pad = '0'))),
  
  # Response variables, as coded above
  eqwlth       = GSS$eqwlth,
  goveqinc     = GSS$goveqinc,
  welfare      = GSS$welfare,
  helppoor     = GSS$helppoor,
  
  # Demographics
  yearInt      = yearInt, 
  female       = female,
  age          = age,
  # age.sq.100,
  educ         = educ,
  race         = race,
  bornInUS     = bornInUS)

# OTHER CANDIDATE VARIABLES

# Convert to a factor and drop unused levels.
c2f <- function (x) { as_factor(x) %>% droplevels() }  

# TRUE where the response equals `level`, FALSE for other substantive 
# responses, and NA where the response is "IAP", "DK", or "NA".
isLevel <- function (x, level) {
  Recode(c2f(x), 'c("IAP", "DK", "NA") = NA') == level
}

# Numeric version of a spending item: codes 0, 8, and 9 are set to NA and the 
# remaining codes are shifted down by 1.
natScore <- function (x) {
  Recode(unclass(x), 'c(0,8,9) = NA') - 1
}

# Reverse the direction of a numeric scale (lowest value becomes highest and 
# vice versa) while leaving every value in its original position.
reverseScale <- function (x) {
  if (all(is.na(x))) {
    return(x)
  }
  sum(range(x, na.rm = TRUE)) - x
}

cand.vars <- data.frame(
  
  # Demographics
  wrkstat  = c2f(GSS$wrkstat) %>% { Recode(., '"NA" = NA') },
  wrkslf   = c2f(GSS$wrkslf)  %>% { Recode(., 'c("IAP", "DK", "NA") = NA') },
  income   = c2f(GSS$income)  %>% { Recode(.,  'c("IAP", "refused", "dk", "na") = NA') } %>% unclass(),
  rincome  = c2f(GSS$rincome) %>% { Recode(.,  'c("IAP", "refused", "dk", "na") = NA') } %>% unclass(), 
  prestg80 = GSS$prestg80 %>% as.integer() %>% na_if(0),
  hompop   = GSS$hompop   %>% as.integer() %>% { Recode(., '98:99=NA') }, 
  
  # Gender attitudes
  fehome   = as.integer(GSS$fehome),
  fework   = as.integer(GSS$fework),
  fepres   = as.integer(GSS$fepres),
  fepol    = as.integer(GSS$fepol),
  fechld   = as.integer(GSS$fechld),
  fepresch = as.integer(GSS$fepresch),
  fefam    = as.integer(GSS$fefam),
  
  # Racial attitudes
  racmar   = isLevel(GSS$racmar,  'yes'),
  raclive  = isLevel(GSS$raclive, 'yes'),
  rachome  = isLevel(GSS$rachome, 'yes'),
  busing   = isLevel(GSS$busing,  'favor'),  # was mistakenly built from rachome
  racpres  = isLevel(GSS$racpres, 'yes'),
  racdif1  = isLevel(GSS$racdif1, 'yes'),
  racdif2  = isLevel(GSS$racdif2, 'yes'),
  racdif3  = isLevel(GSS$racdif3, 'yes'),
  racdif4  = isLevel(GSS$racdif4, 'yes'),
  helpblk  = Recode(as.integer(GSS$helpblk), 'c(0, 8, 9) = NA'),
    
  # Spending attitudes
  natspac  = natScore(GSS$natspac),
  natenvir = natScore(GSS$natenvir),
  natheal  = natScore(GSS$natheal),
  natcity  = natScore(GSS$natcity),
  natcrime = natScore(GSS$natcrime),
  natdrug  = natScore(GSS$natdrug),
  nateduc  = natScore(GSS$nateduc),
  natrace  = natScore(GSS$natrace),
  nataid   = natScore(GSS$nataid),
  natroad  = natScore(GSS$natroad),
  natsoc   = natScore(GSS$natsoc),
  natmass  = natScore(GSS$natmass),
  natpark  = natScore(GSS$natpark),
  
  # Tolerance  
  spkath   = isLevel(GSS$spkath,  'allowed'),
  colath   = isLevel(GSS$colath,  'allowed'),
  libath   = isLevel(GSS$libath,  'not remove'),
  spkrac   = isLevel(GSS$spkrac,  'allowed'),
  colrac   = isLevel(GSS$colrac,  'allowed'),
  librac   = isLevel(GSS$librac,  'not remove'),
  spkcom   = isLevel(GSS$spkcom,  'allowed'),
  colcom   = isLevel(GSS$colcom,  'allowed'),
  libcom   = isLevel(GSS$libcom,  'not remove'),
  spkmil   = isLevel(GSS$spkmil,  'allowed'),
  colmil   = isLevel(GSS$colmil,  'allowed'),
  libmil   = isLevel(GSS$libmil,  'not remove'),
  spkhomo  = isLevel(GSS$spkhomo, 'allowed'),
  colhomo  = isLevel(GSS$colhomo, 'allowed'),
  libhomo  = isLevel(GSS$libhomo, 'not remove'),
    
  # Other variables
  # NOTE(review): the polviews and reliten recodes treat the code 1 as 
  # missing and then subtract 1, whereas the spending items treat 0 as 
  # missing. Confirm that these codes match the values that read_dta() 
  # returns for these two variables.
  partyid  = Recode(unclass(GSS$partyid),  '8:10 = NA'),
  polviews = Recode(unclass(GSS$polviews), 'c(1,9,10) = NA') - 1,
  relig    = c2f(GSS$relig) %>% { Recode(., '"NA" = NA; c("DK", "none")="DK or none"' ) },
  
  # The scale is reversed value by value. The earlier rev() call reversed the 
  # order of the rows instead, which attached each respondent's value to a 
  # different respondent.
  # NOTE(review): this assumes that reverse-coding the scale was the intent.
  reliten  = reverseScale(Recode(unclass(GSS$reliten), 'c(1,6,7) = NA') - 1))
  



####################################################################
# SAVE FILE FOR IMPUTATION
####################################################################
# Write both data frames to a single .RData file.
save(
  list = c('cand.vars', 'reqd.vars'), 
  file = 'data/GSS_datasetForImputation.RData')

